# Load required libraries
library(tidyverse)
library(irr)        # For Cohen's Kappa
library(psych)      # For additional agreement statistics
library(ggplot2)
library(gridExtra)
library(DescTools)

# Set working directory and file paths
# Adjust these paths as needed
results_dir <- "multi_model_results"
comment_scores_file <- "../../data/comments_score.rds"

# ============================================================================
# 1. LOAD DATA
# ============================================================================

cat("Loading data...\n")

# Load the comparison file (all models in one file)
# Assumes you've processed at least one batch
comparison_files <- list.files(
  path = results_dir, 
  pattern = "_ALL_MODELS_COMPARISON\\.csv$", 
  full.names = TRUE
)

if (length(comparison_files) == 0) {
  stop("No comparison files found in ", results_dir)
}

cat("Found", length(comparison_files), "comparison file(s)\n")

# Load all comparison files and combine
all_data <- map_dfr(comparison_files, read_csv, show_col_types = FALSE)

cat("Total comments loaded:", nrow(all_data), "\n\n")

# Load comment scores (if available)
if (file.exists(comment_scores_file)) {
  comment_scores <- readRDS(comment_scores_file)
  cat("Loaded comment scores from RDS file\n")
  
  # Merge if comment_id exists in both
  if ("comment_id" %in% names(comment_scores)) {
    all_data <- all_data %>%
      left_join(comment_scores, by = "comment_id")
    cat("Merged comment scores with results\n")
  }
} else {
  cat("Comment scores file not found, skipping merge\n")
}

# ============================================================================
# 2. IDENTIFY MODEL COLUMNS
# ============================================================================

# Find all discusses_violence columns (one per model)
violence_cols <- names(all_data)[str_detect(names(all_data), "^discusses_violence_")]
score_cols <- names(all_data)[str_detect(names(all_data), "^violence_score_")]

# Extract model names
model_names <- str_remove(violence_cols, "^discusses_violence_")

cat("\nModels found:\n")
for (i in seq_along(model_names)) {
  cat(sprintf("  %d. %s\n", i, model_names[i]))
}

# ============================================================================
# 3. PREPARE DATA FOR ANALYSIS
# ============================================================================

# Convert TRUE/FALSE to binary (1/0) for analysis
violence_binary <- all_data %>%
  select(comment_id, all_of(violence_cols), scalar_sum) %>%
  mutate(across(all_of(violence_cols), ~as.numeric(as.logical(.))))

violence_binary$scalar_sum <-
  as.numeric(violence_binary$scalar_sum>=1)

# Remove rows with any NA values
violence_complete <- violence_binary %>%
  drop_na()

cat("\nComplete cases for analysis:", nrow(violence_complete), "\n")
cat("Removed", nrow(violence_binary) - nrow(violence_complete), "rows with missing values\n\n")

# ============================================================================
# 4. CALCULATE COHEN'S KAPPA (PAIRWISE)
# ============================================================================

# Create matrix to store kappa values
n_models <- length(violence_cols) + 1
kappa_matrix <- matrix(NA, nrow = n_models, ncol = n_models)
rownames(kappa_matrix) <- c(model_names, "scalar_sum")
colnames(kappa_matrix) <- c(model_names, "scalar_sum")

violence_cols_plus_one <- 
  c(violence_cols, "scalar_sum")


# Calculate pairwise Cohen's Kappa
for (i in 1:n_models) {
  for (j in 1:n_models) {
    if (i == j) {
      kappa_matrix[i, j] <- 1.0  # Perfect agreement with itself
    } else if (i < j) {
      # Calculate kappa
      kappa_result <- kappa2(violence_complete[, c(violence_cols_plus_one[i], violence_cols_plus_one[j])])
      kappa_matrix[i, j] <- kappa_result$value
      kappa_matrix[j, i] <- kappa_result$value
    }
  }
}

# Print kappa matrix
cat("Cohen's Kappa Matrix (pairwise agreement):\n")
print(round(kappa_matrix, 3))
write.csv(kappa_matrix, file = "kappa_matrix.csv", row.names = F)

# Interpretation guide
cat("\nInterpretation of Kappa values:\n")
cat("  < 0.00  = Poor agreement\n")
cat("  0.00-0.20 = Slight agreement\n")
cat("  0.21-0.40 = Fair agreement\n")
cat("  0.41-0.60 = Moderate agreement\n")
cat("  0.61-0.80 = Substantial agreement\n")
cat("  0.81-1.00 = Almost perfect agreement\n\n")

# ============================================================================
# 5. OVERALL AGREEMENT STATISTICS
# ============================================================================


# Fleiss' Kappa (for more than 2 raters)
if (n_models > 2) {
  fleiss_result <- kappam.fleiss(violence_complete[, violence_cols])
  cat("Fleiss' Kappa (overall agreement across all models):\n")
  cat(sprintf("  Kappa = %.3f\n", fleiss_result$value))
  cat(sprintf("  p-value = %.4f\n\n", fleiss_result$p.value))
}

model_names_plus_one <- c(model_names, "scalar_sum")

# Calculate agreement rates
sink("agreement_rates.txt")
cat("Raw Agreement Rates:\n")
for (i in 1:(n_models-1)) {
  for (j in (i+1):n_models) {
    agreement <- mean(violence_complete[[violence_cols_plus_one[i]]] == violence_complete[[violence_cols_plus_one[j]]])
    cat(sprintf("  %s vs %s: %.1f%%\n", 
                model_names_plus_one[i], model_names_plus_one[j], agreement * 100))
  }
}
sink()

# ============================================================================
# 6. SCALE TRANSFORMATION METHODS
# ============================================================================

# Create a working dataset with complete cases
analysis_data <- all_data %>%
  select(comment_id, text_original, scalar_sum, all_of(score_cols)) %>%
  drop_na()

cat(sprintf("Complete cases for analysis: %d\n\n", nrow(analysis_data)))

# Method 1: Z-score standardization (mean=0, sd=1)
cat("Method 1: Z-score standardization\n")
analysis_data <- analysis_data %>%
  mutate(
    scalar_sum_zscore = scale(scalar_sum)[,1],
    across(all_of(score_cols), 
           ~scale(.)[,1], 
           .names = "{.col}_zscore")
  )

# Method 2: Min-Max normalization to 0-10 range
cat("Method 2: Min-Max scaling (0-10)\n")
analysis_data <- analysis_data %>%
  mutate(
    scalar_sum_minmax = (scalar_sum - min(scalar_sum)) / 
      (max(scalar_sum) - min(scalar_sum)) * 10
  )

# Method 3: Rank transformation (percentile ranking)
cat("Method 3: Rank transformation\n")
analysis_data <- analysis_data %>%
  mutate(
    scalar_sum_rank = rank(scalar_sum, ties.method = "average"),
    across(all_of(score_cols), 
           ~rank(., ties.method = "average"), 
           .names = "{.col}_rank")
  )


# ============================================================================
# 7. CORRELATION COMPARISONS ACROSS TRANSFORMATIONS
# ============================================================================

correlation_results <- tibble(
  Model = character(),
  Method = character(),
  Pearson_r = numeric(),
  Spearman_rho = numeric()
)

for (i in seq_along(model_names)) {
  model <- model_names[i]
  score_col <- score_cols[i]
  
  # Original scales
  correlation_results <- correlation_results %>%
    add_row(
      Model = model,
      Method = "Original",
      Pearson_r = cor(analysis_data$scalar_sum, 
                      analysis_data[[score_col]], 
                      method = "pearson"),
      Spearman_rho = cor(analysis_data$scalar_sum, 
                         analysis_data[[score_col]], 
                         method = "spearman")
    )
  
  # Z-score
  correlation_results <- correlation_results %>%
    add_row(
      Model = model,
      Method = "Z-score",
      Pearson_r = cor(analysis_data$scalar_sum_zscore, 
                      analysis_data[[paste0(score_col, "_zscore")]], 
                      method = "pearson"),
      Spearman_rho = cor(analysis_data$scalar_sum_zscore, 
                         analysis_data[[paste0(score_col, "_zscore")]], 
                         method = "spearman")
    )
  
  # Min-Max
  correlation_results <- correlation_results %>%
    add_row(
      Model = model,
      Method = "Min-Max",
      Pearson_r = cor(analysis_data$scalar_sum_minmax, 
                      analysis_data[[score_col]], 
                      method = "pearson"),
      Spearman_rho = cor(analysis_data$scalar_sum_minmax, 
                         analysis_data[[score_col]], 
                         method = "spearman")
    )
  
  # Rank
  correlation_results <- correlation_results %>%
    add_row(
      Model = model,
      Method = "Rank",
      Pearson_r = cor(analysis_data$scalar_sum_rank, 
                      analysis_data[[paste0(score_col, "_rank")]], 
                      method = "pearson"),
      Spearman_rho = cor(analysis_data$scalar_sum_rank, 
                         analysis_data[[paste0(score_col, "_rank")]], 
                         method = "spearman")
    )
  
}

print(correlation_results)

write.csv(correlation_results, file = "correlation_results.csv", row.names = F)


# ============================================================================
# 8. PAIRWISE ICC: scalar_sum vs Each LLM Model
# ============================================================================

# Initialize results table
icc_results <- tibble(
  Model = character(),
  ICC_zscore = numeric(),
  ICC_zscore_lower = numeric(),
  ICC_zscore_upper = numeric(),
  ICC_zscore_pvalue = numeric(),
  ICC_minmax = numeric(),
  ICC_minmax_lower = numeric(),
  ICC_minmax_upper = numeric(),
  ICC_minmax_pvalue = numeric(),
  Status = character()
)

# Calculate ICC for each model
for (i in seq_along(model_names)) {
  model <- model_names[i]
  score_col <- score_cols[i]
  
  cat(sprintf("\n%s\n", model))
  cat(strrep("-", nchar(model)), "\n")
  
  # Method 1: Z-score standardized
  zscore_data <- analysis_data %>%
    select(scalar_sum_zscore, paste0(score_col, "_zscore")) %>%
    filter(if_all(everything(), is.finite))
  
  icc_z <- tryCatch({
    icc(zscore_data, 
        model = "twoway", 
        type = "agreement", 
        unit = "single")
  }, error = function(e) {
    list(value = NA, lbound = NA, ubound = NA, p.value = NA)
  })
  
  # Method 2: Min-Max scaled (both 0-10)
  minmax_data <- analysis_data %>%
    select(scalar_sum_minmax, !!sym(score_col)) %>%
    filter(if_all(everything(), is.finite))
  
  icc_mm <- tryCatch({
    icc(minmax_data, 
        model = "twoway", 
        type = "agreement", 
        unit = "single")
  }, error = function(e) {
    list(value = NA, lbound = NA, ubound = NA, p.value = NA)
  })
  
  # Determine status
  status <- "OK"
  if (is.na(icc_z$value) && is.na(icc_mm$value)) {
    status <- "Failed"
  } else if (is.na(icc_z$value) || is.na(icc_mm$value)) {
    status <- "Partial"
  }
  
  # Print results
  if (!is.na(icc_z$value)) {
    cat(sprintf("  Z-score ICC: %.3f [%.3f, %.3f], p=%.4f\n",
                icc_z$value, icc_z$lbound, icc_z$ubound, icc_z$p.value))
  } else {
    cat("  Z-score ICC: Failed to calculate\n")
  }
  
  if (!is.na(icc_mm$value)) {
    cat(sprintf("  Min-Max ICC: %.3f [%.3f, %.3f], p=%.4f\n",
                icc_mm$value, icc_mm$lbound, icc_mm$ubound, icc_mm$p.value))
  } else {
    cat("  Min-Max ICC: Failed to calculate\n")
  }
  
  # Add to results
  icc_results <- icc_results %>%
    add_row(
      Model = model,
      ICC_zscore = icc_z$value,
      ICC_zscore_lower = icc_z$lbound,
      ICC_zscore_upper = icc_z$ubound,
      ICC_zscore_pvalue = icc_z$p.value,
      ICC_minmax = icc_mm$value,
      ICC_minmax_lower = icc_mm$lbound,
      ICC_minmax_upper = icc_mm$ubound,
      ICC_minmax_pvalue = icc_mm$p.value,
      Status = status
    )
}

# ============================================================================
# 9. SUMMARY TABLE
# ============================================================================

print(icc_results %>% 
        select(Model, ICC_zscore, ICC_minmax, Status))

write.csv(icc_results %>% 
            select(Model, ICC_zscore, ICC_minmax, Status),
          file = "icc_results.csv",
          row.names = F)

cat("\nInterpretation:\n")
cat("  < 0.50 = Poor agreement\n")
cat("  0.50-0.75 = Moderate agreement\n")
cat("  0.75-0.90 = Good agreement\n")
cat("  > 0.90 = Excellent agreement\n\n")




